library(ggplot2)
library(fivethirtyeight)
There are many ways to “say” the same thing in ggplot2
# Variant 1: data is given to ggplot(); every layer spells out its own x/y mapping.
ggplot(data = bechdel) +
geom_point(mapping = aes(x = budget, y = domgross)) +
geom_smooth(mapping = aes(x= budget, y = domgross))
## `geom_smooth()` using method = 'gam'
## Warning: Removed 17 rows containing non-finite values (stat_smooth).
## Warning: Removed 17 rows containing missing values (geom_point).
# Variant 2: the mapping is declared once in ggplot() and inherited by both layers.
ggplot(data = bechdel, aes(x= budget, y = domgross)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam'
## Warning: Removed 17 rows containing non-finite values (stat_smooth).
## Warning: Removed 17 rows containing missing values (geom_point).
# Variant 3: data passed positionally (argument name omitted); mapping lives in the layer.
ggplot(bechdel) +
geom_point(aes(x = budget, y = domgross))
## Warning: Removed 17 rows containing missing values (geom_point).
# Variant 4: data and mapping both passed positionally to ggplot().
ggplot(bechdel, aes(x = budget, y = domgross)) +
geom_point()
## Warning: Removed 17 rows containing missing values (geom_point).
# Variant 5: x is mapped globally, y is added by the layer; the two mappings are combined.
ggplot(bechdel, aes(x= budget)) +
geom_point(aes(y = domgross))
## Warning: Removed 17 rows containing missing values (geom_point).
Override the global mappings
# x and y are mapped once in ggplot() and inherited by both layers.
# The colour mapping is added for the points only, so geom_smooth() still
# fits a single curve through all of the data.
ggplot(data = bechdel, mapping = aes(x = budget, y = domgross)) +
  geom_point(mapping = aes(color = clean_test)) +
  geom_smooth()
## `geom_smooth()` using method = 'gam'
## Warning: Removed 17 rows containing non-finite values (stat_smooth).
## Warning: Removed 17 rows containing missing values (geom_point).
To use the dplyr package, we need to load it.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Let’s make a toy dataset to play with.
# Toy data: one row per Beatle with birth year and main instrument.
beatles <- data.frame(
  name       = c("John", "Paul", "George", "Ringo"),
  birth      = c(1940, 1942, 1943, 1940),
  instrument = c("guitar", "bass", "guitar", "drums")
)
Select is a way to extract columns from dataframes
# First argument is the data frame; the remaining arguments are the (unquoted) columns to keep.
select(beatles, name, birth)
Select just the instrument column
# select() with a single column still returns a data frame.
select(beatles, instrument)
# Base R bracket indexing is [row, column]: this is row 2, column 3.
beatles[2,3]
## [1] bass
## Levels: bass drums guitar
# NOTE(review): the "Levels:" line in the recorded output means the column was a factor.
# data.frame() has defaulted to stringsAsFactors = FALSE since R 4.0, so current R
# prints a plain character value here instead.
# An empty row index selects every row; a single column name comes back as a vector.
beatles[ ,"birth"]
## [1] 1940 1942 1943 1940
# A vector of column names returns a data frame with those columns.
beatles[ ,c("name","birth")]
# $ extracts one column as a vector.
beatles$birth
## [1] 1940 1942 1943 1940
Select just the instrument column using brackets
# Empty row index = all rows; the quoted name picks the column, returned as a vector.
beatles[ , "instrument"]
## [1] guitar bass guitar drums
## Levels: bass drums guitar
Select just the instrument column using a dollar sign.
beatles$instrument
## [1] guitar bass guitar drums
## Levels: bass drums guitar
# filter() keeps the rows for which the condition is TRUE.
filter(beatles, name == "George")
# Comparison operators are vectorized: each returns one logical value per element of x.
x <- c(1, 2, 3, 4, 5)
x > 3
## [1] FALSE FALSE FALSE TRUE TRUE
x >= 3
## [1] FALSE FALSE TRUE TRUE TRUE
x < 3
## [1] TRUE TRUE FALSE FALSE FALSE
x <= 3
## [1] TRUE TRUE TRUE FALSE FALSE
x == 3
## [1] FALSE FALSE TRUE FALSE FALSE
x != 3
## [1] TRUE TRUE FALSE TRUE TRUE
# NOTE(review): a single `=` is assignment, not comparison. This line overwrites x with 3
# and prints nothing. Presumably shown as a contrast to `==` above; confirm intent.
x = 3
filter(beatles, birth==1940)
# Comma-separated conditions are combined with AND, so the next two calls are equivalent.
filter(beatles, birth==1940, instrument == "guitar")
filter(beatles, birth==1940 & instrument == "guitar")
# Base R equivalents: index rows with a logical vector (TRUE = keep the row).
beatles[c(FALSE,TRUE,TRUE,FALSE), ]
beatles[beatles$birth == 1940, ]
Names with at least 5 uses in a particular year, from the Social Security Administration.
library(babynames)
Do you remember how to view a dataset?
# Keep rows whose prop is at least 0.08.
filter(babynames, prop >= 0.08)
filter(babynames, name == "Sea")
# Missing values must be tested with is.na(); `n == NA` never evaluates to TRUE.
filter(babynames, is.na(n))
filter(babynames, name == "Sea", sex == "F")
# `|` is OR inside one condition; the comma then ANDs that result with year == 1880.
filter(babynames, n == 5 | n == 6, year == 1880)
# %in% is TRUE when name matches any value in the vector.
filter(babynames, name %in% c("Acura", "Lexus", "Yugo"))
filter(babynames, name %in% c("Acura", "Lexus", "Yugo"))
# filter() returns a new data frame; assign it to a name to keep the result.
carnames <- filter(babynames, name %in% c("Acura", "Lexus", "Yugo"))
# Reusing the name `babynames` creates a workspace object that masks the package dataset.
babynames <- filter(babynames, name %in% c("Acura", "Lexus", "Yugo")) # dangerous!
# rm() deletes only that workspace copy, so the name resolves to the package dataset
# again; the str() output below shows the full 1858689 rows.
rm(babynames)
str(babynames) #phew!
## Classes 'tbl_df', 'tbl' and 'data.frame': 1858689 obs. of 5 variables:
## $ year: num 1880 1880 1880 1880 1880 1880 1880 1880 1880 1880 ...
## $ sex : chr "F" "F" "F" "F" ...
## $ name: chr "Mary" "Anna" "Emma" "Elizabeth" ...
## $ n : int 7065 2604 2003 1939 1746 1578 1472 1414 1320 1288 ...
## $ prop: num 0.0724 0.0267 0.0205 0.0199 0.0179 ...
Try to: - Filter the babynames data down to just your name. (You may also want to filter on the sex column.) - Assign the result its own name - Use your subsetted data to create a ggplot graphic of the popularity of your name over time
# Subset to one name/sex combination, then plot the yearly counts
# first as points and then as a line.
amelia <- filter(babynames, name == "Amelia" & sex == "F")
ggplot(data = amelia) +
  geom_point(mapping = aes(x = year, y = n))
ggplot(data = amelia) +
  geom_line(mapping = aes(x = year, y = n))

# Step-by-step version: the result of each verb is saved back to boys_2015.
boys_2015 <- filter(babynames, year == 2015, sex == "M")
boys_2015 <- select(boys_2015, name, n)
boys_2015 <- arrange(boys_2015, desc(n))
boys_2015
Nested nonsense
# Same query as one nested call: it evaluates inside-out
# (filter, then select, then arrange).
arrange(
  select(
    filter(babynames, year == 2015, sex == "M"),
    name, n
  ),
  desc(n)
)
The %>% operator is the pipe. We can reorganize our code like so:
# Pipeline form: each step's result becomes the first argument of the next verb.
babynames %>%
  filter(year == 2015, sex == "M") %>%
  select(name, n) %>%
  arrange(desc(n))
[Tip: in RStudio, press Cmd+Shift+M (Ctrl+Shift+M on Windows/Linux) to insert a pipe]
Use %>% to write a sequence of functions that: - Filter babynames to just the girls that were born in 2015 - Select the name and n columns - Arrange the results so that the most popular names are near the top.
# Girls born in 2015, most common names first.
babynames %>%
  filter(sex == "F", year == 2015) %>%
  select(n, name) %>%
  arrange(desc(n))

# Step-by-step version: the result of each verb is saved back to boys_2015.
boys_2015 <- filter(babynames, year == 2015, sex == "M")
boys_2015 <- select(boys_2015, name, n)
boys_2015 <- arrange(boys_2015, desc(n))
boys_2015
Nested nonsense
# Same query as one nested call: it evaluates inside-out
# (filter, then select, then arrange).
arrange(
  select(
    filter(babynames, year == 2015, sex == "M"),
    name, n
  ),
  desc(n)
)
The %>% operator is the pipe. We can reorganize our code like so:
# Pipeline form: each step's result becomes the first argument of the next verb.
babynames %>%
  filter(year == 2015, sex == "M") %>%
  select(name, n) %>%
  arrange(desc(n))
[Tip: in RStudio, press Cmd+Shift+M (Ctrl+Shift+M on Windows/Linux) to insert a pipe]
Use %>% to write a sequence of functions that: - Filter babynames to just the girls that were born in 2015 - Select the name and n columns - Arrange the results so that the most popular names are near the top.
# Girls born in 2015, most common names first.
babynames %>%
  filter(sex == "F", year == 2015) %>%
  select(n, name) %>%
  arrange(desc(n))
Just the weird ones
# Rows with an implausible measurement: y above 30 or x below 2.
diamonds %>%
  filter(y > 30 | x < 2)
Not the weird ones
# Drop the implausible rows and plot what remains.
# The condition is the exact negation of `y > 30 | x < 2`: a row is kept only
# when BOTH y <= 30 and x >= 2 hold (comma-separated conditions are ANDed).
# Joining the flipped comparisons with `|` would keep any row where just one
# coordinate is ordinary, so nearly every outlier would survive.
# Operators: %>% chains the dplyr steps, + adds the ggplot2 layers.
diamonds %>%
  filter(y <= 30, x >= 2) %>%
  ggplot() +
  geom_point(aes(x = x, y = y))
Our syntax is getting weird! Note that the dplyr steps at the beginning are chained with %>%, while the ggplot2 layers are added with +. Be careful not to mix the two up.